Install a package countrycode first.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(WDI)
library(readxl)
library(countrycode)
World Inequality Report 2022: https://wir2022.wid.world
Methodology [Link]
Since Excel files are binary file, you need to add mode = “wb”, web binary.
url_summary <- "https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx"
download.file(url = url_summary, destfile = "data/WIR2022s.xlsx", mode = "wb")
library(readxl) # readxl is a part of tidyverse but not a core package
excel_sheets("data/WIR2022s.xlsx")
## [1] "Index" "F1" "F2" "F3" "F4" "F5."
## [7] "F6" "F7" "F8" "F9" "F10" "F11"
## [13] "F12" "F13" "F14" "F15" "T1" "data-F1"
## [19] "data-F2" "data-F3" "data-F4" "data-F5" "data-F6" "data-F7"
## [25] "data-F8" "data-F9" "data-F10" "data-F11" "data-F12" "data-F13."
## [31] "data-F14." "data-F15"
df_wir_f2 <- read_excel("data/WIR2022s.xlsx",
sheet = "data-F2")
df_wir_f2
Created a new Excel book with only one sheet, and save it as a CSV UTF-8.
df_wir_f2_2 <- read_csv("data/wir-f2.csv")
## Rows: 8 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): iso, Bottom 50%, Middle 40%, Top 10%
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_wir_f2_2
df_wir_f2_2 |> identical(df_wir_f2)
## [1] FALSE
df_wir_f2_3 <- read_delim(clipboard())
df_wir_f2_3
df_wir_f2 |> pivot_longer(3:5) |>
ggplot(aes(iso, value, fill = name)) + geom_col(position = "dodge") + coord_flip()
Popular statistical tables
Explorer - datamarts: http://data.un.org/Explorer.aspx
Copy the link of International Migrants and Refugees
un_migrants_url <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"
download.file(un_migrants_url, destfile = "data/migrants.csv")
df_un_migrants <- read_csv("data/migrants.csv")
## New names:
## Rows: 7239 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): T04, International migrants and refugees, ...3, ...4, ...5, ...6, ...7
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
df_un_migrants
df_un_migrants <- read_csv("data/migrants.csv", skip=1)
## New names:
## Rows: 7238 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): ...2, Series, Footnotes, Source dbl (2): Region/Country/Area, Year num
## (1): Value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
df_un_migrants
url_un_migrants <- "https://data.un.org/_Docs/SYB/CSV/SYB66_327_202310_International%20Migrants%20and%20Refugees.csv"
Since the url is given, a CSV file can be read directly using
read_csv.
df_un_migrants <- read_csv(url_un_migrants, skip = 1)
## New names:
## Rows: 7238 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): ...2, Series, Footnotes, Source dbl (2): Region/Country/Area, Year num
## (1): Value
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...2`
df_un_migrants
str(df_un_migrants)
## spc_tbl_ [7,238 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Region/Country/Area: num [1:7238] 1 1 1 1 1 1 1 1 1 1 ...
## $ ...2 : chr [1:7238] "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" "Total, all countries or areas" ...
## $ Year : num [1:7238] 2005 2005 2005 2005 2010 ...
## $ Series : chr [1:7238] "International migrant stock: Both sexes (number)" "International migrant stock: Both sexes (% total population)" "International migrant stock: Male (% total Population)" "International migrant stock: Female (% total Population)" ...
## $ Value : num [1:7238] 1.91e+08 2.90 3.00 2.90 2.21e+08 ...
## $ Footnotes : chr [1:7238] NA NA NA NA ...
## $ Source : chr [1:7238] "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." "United Nations Population Division, New York, International migrant stock: The 2020 Revision, last accessed January 2022." ...
## - attr(*, "spec")=
## .. cols(
## .. `Region/Country/Area` = col_double(),
## .. ...2 = col_character(),
## .. Year = col_double(),
## .. Series = col_character(),
## .. Value = col_number(),
## .. Footnotes = col_character(),
## .. Source = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
df_un_migrants |> summary()
## Region/Country/Area ...2 Year Series
## Min. : 1.0 Length:7238 Min. :2005 Length:7238
## 1st Qu.:178.0 Class :character 1st Qu.:2010 Class :character
## Median :404.0 Mode :character Median :2015 Mode :character
## Mean :407.1 Mean :2014
## 3rd Qu.:630.0 3rd Qu.:2020
## Max. :894.0 Max. :2022
## Value Footnotes Source
## Min. : 0 Length:7238 Length:7238
## 1st Qu.: 5 Class :character Class :character
## Median : 154 Mode :character Mode :character
## Mean : 731059
## 3rd Qu.: 31096
## Max. :280598105
df_un_migrants |> select(Year,Series) |> lapply(unique)
## $Year
## [1] 2005 2010 2015 2020 2022 2019 2018 2016 2021 2017
##
## $Series
## [1] "International migrant stock: Both sexes (number)"
## [2] "International migrant stock: Both sexes (% total population)"
## [3] "International migrant stock: Male (% total Population)"
## [4] "International migrant stock: Female (% total Population)"
## [5] "Total refugees and people in refugee-like situations (number)"
## [6] "Asylum seekers, including pending cases (number)"
## [7] "Other of concern to UNHCR (number)"
## [8] "Total population of concern to UNHCR (number)"
df_un_migrants |> distinct(`Region/Country/Area`, ...2)
library(countrycode)
df_un_migrants %>% mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants |> mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |> filter(is.na(iso2c_un)) |> distinct(...2)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
wdicache <- read_rds("data/wdicache.rds")
wdi_country_extra <- wdicache$country |> select(iso2c, region, income, lending)
df_un_migrants_ext <- df_un_migrants %>%
mutate(iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c"), .after = ...2) |>
drop_na(iso2c_un) |>
select(Country = ...2, ISO2C = iso2c_un, Year, Series, Value, Footnotes) |>
left_join(wdi_country_extra, by = c("ISO2C" = "iso2c"))
df_un_migrants_ext
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `iso2c_un = countrycode(`Region/Country/Area`, "un", "iso2c")`.
## Caused by warning:
## ! Some values were not matched unambiguously: 1, 2, 5, 9, 11, 13, 14, 15, 17, 18, 21, 29, 30, 34, 35, 39, 53, 54, 57, 61, 62, 142, 143, 145, 150, 151, 154, 155, 202, 419, 420, 830
df_un_migrants_ext$Series |> unique()
## [1] "International migrant stock: Both sexes (number)"
## [2] "International migrant stock: Both sexes (% total population)"
## [3] "International migrant stock: Male (% total Population)"
## [4] "International migrant stock: Female (% total Population)"
## [5] "Total refugees and people in refugee-like situations (number)"
## [6] "Asylum seekers, including pending cases (number)"
## [7] "Other of concern to UNHCR (number)"
## [8] "Total population of concern to UNHCR (number)"
df_un_migrants_ext_rev <- df_un_migrants_ext |> mutate(Ser = case_when(
Series == "International migrant stock: Both sexes (number)" ~ "migrant",
Series == "International migrant stock: Both sexes (% total population)" ~ "migrant_percent",
Series == "International migrant stock: Male (% total Population)" ~ "migrant_male",
Series == "International migrant stock: Female (% total Population)" ~ "migrant_female",
Series == "Total refugees and people in refugee-like situations (number)" ~ "refugee",
Series == "Asylum seekers, including pending cases (number)" ~ "asylum",
Series == "Other of concern to UNHCR (number)" ~ "other",
Series == "Total population of concern to UNHCR (number)" ~ "concern",
TRUE ~ Series), .before = Series)
df_un_migrants_ext_rev
dput(df_un_migrants_ext_rev)
Environment Statistics Database>Water>Internal flow [Link]
Select columns to add Table ID and Country and Area Code of UN, and download ‘Comma’ a CSV.
df_un_water <- read_csv("data/UNdata_Export_20240124_010609671.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 1604 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country or Area Code, Country or Area, Unit
## dbl (4): Table ID, Year, Value, Value Footnotes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_water
OECD data https://data.oecd.org/
There is a newly developed Database Access linked above. However, it is still under development and difficult to handle data there.
Permanent immigrant inflows Total, Number, 2022 Link
Definition of Permanent immigrant inflows
library(htmltools)
iframe_code <- '<iframe src="https://data.oecd.org/chart/7kgk" width="860" height="645" style="border: 0" mozallowfullscreen="true" webkitallowfullscreen="true" allowfullscreen="true"><a href="https://data.oecd.org/chart/7kgk" target="_blank">OECD Chart: Permanent immigrant inflows, Total, Number, Annual, 2022</a></iframe>'
HTML(iframe_code)
[Link]
Table in Excel, Filtered data in tableau text (CSV), Unfiltered data in tableau text (CSV) 136.4MB
temp <- read_csv("../../../bigdata/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+all.csv")
temp
library(readxl)
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
df_oecd_ed
df_oecd_ed <- read_excel("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0,filtered,2024-01-24 10-22-31.xlsx", skip = 6)
## New names:
## • `Education level` -> `Education level...1`
## • `Education level` -> `Education level...2`
## • `` -> `...13`
df_oecd_ed
colnames(df_oecd_ed)
## [1] "Education level...1"
## [2] "Education level...2"
## [3] "Primary to tertiary education"
## [4] "Primary education"
## [5] "Lower secondary education"
## [6] "Upper secondary education"
## [7] "· \nUpper secondary general education"
## [8] "· \nUpper secondary vocational education"
## [9] "Post-secondary non-tertiary education"
## [10] "Tertiary education"
## [11] "· \nShort-cycle tertiary education"
## [12] "· \nBachelor's, Master's and Doctoral or equivalent level"
## [13] "...13"
df_oecd_ed_short <- df_oecd_ed |> select(1,3,4,5,6,9,10) |> drop_na(`Primary education`)
df_oecd_ed_short
df_oecd_ed2 <- read_csv("data/OECD.EDU.IMEP,DSD_EAG_UOE_FIN@DF_UOE_INDIC_FIN_PERSTUD,1.0+..ISCED11_1+ISCED11_2+ISCED11_3+ISCED11_4+ISCED11_5+ISCED11_5T8+ISCED11_6T8+ISCED11_1T8._T.INST_EDU.DIR_EXP...csv")
## New names:
## Rows: 816 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (12): STRUCTURE, STRUCTURE_ID, ACTION, MEASURE, REF_AREA, EDUCATION_LEV,... dbl
## (4): TIME_PERIOD, OBS_VALUE, UNIT_MULT, DECIMALS lgl (16): STRUCTURE_NAME,
## ...6, ...8, ...10, ...12, ...14, ...16, ...18, ......
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...10`
## • `` -> `...12`
## • `` -> `...14`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...20`
## • `` -> `...22`
## • `` -> `...24`
## • `` -> `...26`
## • `` -> `...28`
## • `` -> `...30`
## • `` -> `...32`
df_oecd_ed2
Data Analysis for Researchers. [Link]
Posit Recipes(New edition of Posit Primers): interactive exercises [Link]
Cheat Sheet. [Site Link]
dplyr: A Grammar of Data Manipulation [Link]
Model Summary
r-statistics.co by Selva Prabhakaran:
Meaning Behind Each Section of Summary()
swirl: install.packages("swirl")
libray(swirl), and swirl() after
rm(list=ls())